Number of papers over time

data source

We load the data from the Competence Centre for Bibliometrics: http://www.bibliometrie.info/. It licenses access to the Web of Science and Scopus bibliometric databases, which together cover a large share of the peer-reviewed research literature. The Competence Centre further processes the data of both databases so that they can be queried with SQL.

load libraries:


In [1]:
import cx_Oracle #ensure that the OS, InstantClient (Basic, ODBC, SDK) and cx_Oracle are all 64-bit. Install with "pip install cx_Oracle" and add the InstantClient directory to the PATH variable!
import pandas as pd
import re
import plotly.plotly as py
import plotly.graph_objs as go

set parameters


In [2]:
#parameters:
searchterm="big data" #must be lowercase (the query lowercases all compared fields)
colorlist=["#01be70","#586bd0","#c0aa12","#0183e6","#f69234","#0095e9","#bd8600","#007bbe","#bb7300","#63bcfc","#a84a00","#01bedb","#82170e","#00c586","#a22f1f","#3fbe57","#3e4681","#9bc246","#9a9eec","#778f00","#00aad9","#fc9e5e","#01aec1","#832c1e","#55c99a","#dd715b","#017c1c","#ff9b74","#009556","#83392a","#00b39b","#8e5500","#50a7c6","#f4a268","#02aca7","#532b00","#67c4bd","#5e5500","#f0a18f","#007229","#d2b073","#005d3f","#a5be6b","#2a4100","#8cb88c","#2f5c00","#007463","#5b7200","#787c48","#3b7600"]

load data from SQL database:


In [ ]:
dsn_tns=cx_Oracle.makedsn('127.0.0.1','6025',service_name='bibliodb01.fiz.karlsruhe')
#due to licence requirements, access is only allowed for members of the Competence Centre
#for Bibliometrics and its cooperation partners. You can still continue with the resulting CSV below.

#open connection (replace the placeholders with your credentials):
db=cx_Oracle.connect("<username>", "<password>", dsn_tns)
print(db.version)
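
To avoid hard-coding credentials in the notebook, you can also read them from environment variables. A minimal sketch, assuming the (hypothetical) variables BIBLIO_USER and BIBLIO_PASS were set in the shell beforehand:


In [ ]:
import os
#hypothetical variable names, set in the shell before starting the notebook:
db = cx_Oracle.connect(os.environ["BIBLIO_USER"], os.environ["BIBLIO_PASS"], dsn_tns)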

In [ ]:
#%% define sql-query function:
def read_query(connection, query):
    """Run a SQL query over the given connection and return the result as a DataFrame."""
    cursor = connection.cursor()
    try:
        cursor.execute(query)
        names = [x[0] for x in cursor.description] #column names from the cursor metadata
        rows = cursor.fetchall()
        return pd.DataFrame(rows, columns=names)
    finally:
        if cursor is not None: #always close the cursor, even if the query fails
            cursor.close()
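
A quick usage example (Oracle's built-in DUAL table is handy as a connectivity check):


In [ ]:
#sanity check: should return a one-row, one-column DataFrame
read_query(db, "SELECT 1 AS test FROM dual")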

In [ ]:
#%% load paper titles from WOSdb:
database="wos_B_2016"          
            
command="""SELECT DISTINCT(ARTICLE_TITLE), PUBYEAR   
 FROM """+database+""".KEYWORDS, """+database+""".ITEMS_KEYWORDS, """+database+""".ITEMS 
 WHERE
 """+database+""".ITEMS_KEYWORDS.FK_KEYWORDS="""+database+""".KEYWORDS.PK_KEYWORDS
 AND """+database+""".ITEMS.PK_ITEMS="""+database+""".ITEMS_KEYWORDS.FK_ITEMS  
 AND (lower("""+database+""".KEYWORDS.KEYWORD) LIKE '%"""+searchterm+"""%' OR lower(ARTICLE_TITLE) LIKE '%"""+searchterm+"""%')
"""

dfWOS=read_query(db,command)
dfWOS['wos']=True #flag to make the source identifiable after merging
dfWOS.to_csv("all_big_data_titles_year_wos.csv", sep=';')


#%% load paper titles from SCOPUSdb:
database="SCOPUS_B_2016"            
            
command="""SELECT DISTINCT(ARTICLE_TITLE), PUBYEAR  
 FROM """+database+""".KEYWORDS, """+database+""".ITEMS_KEYWORDS, """+database+""".ITEMS 
 WHERE
 """+database+""".ITEMS_KEYWORDS.FK_KEYWORDS="""+database+""".KEYWORDS.PK_KEYWORDS
 AND """+database+""".ITEMS.PK_ITEMS="""+database+""".ITEMS_KEYWORDS.FK_ITEMS  
 AND (lower("""+database+""".KEYWORDS.KEYWORD) LIKE '%"""+searchterm+"""%' OR lower(ARTICLE_TITLE) LIKE '%"""+searchterm+"""%')
"""

dfSCOPUS=read_query(db,command)
dfSCOPUS['scopus']=True #flag to make the source identifiable after merging
dfSCOPUS.to_csv("all_big_data_titles_year_scopus.csv", sep=';')

#these queries take some time; we will work with the exported CSV files from here on
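
Note that the queries above splice the search term directly into the SQL string; for arbitrary input, an Oracle bind variable is the safer choice. A minimal sketch of the same filter (assuming, as the join above suggests, that ARTICLE_TITLE and PUBYEAR live in the ITEMS table):


In [ ]:
#sketch: pass the search term as a bind variable instead of concatenating it
cursor = db.cursor()
cursor.execute(
    "SELECT DISTINCT(ARTICLE_TITLE), PUBYEAR FROM wos_B_2016.ITEMS "
    "WHERE lower(ARTICLE_TITLE) LIKE :term",
    term="%" + searchterm + "%")
rows = cursor.fetchall()
cursor.close()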

merging data


In [14]:
dfWOS=pd.read_csv("all_big_data_titles_year_wos.csv",sep=";")
dfSCOPUS=pd.read_csv("all_big_data_titles_year_scopus.csv",sep=";")

df=pd.merge(dfWOS,dfSCOPUS,on='ARTICLE_TITLE',how='outer')
#consolidate PUBYEAR in one column: for papers that are (also) in WoS,
#PUBYEAR_y (from Scopus) may be missing, so copy the WoS year over:
df.loc[df['wos'] == 1, 'PUBYEAR_y'] = df['PUBYEAR_x']
#keep only the relevant columns and save the resulting csv again:
df=df[['ARTICLE_TITLE','PUBYEAR_y','wos','scopus']]
df.to_csv("all_big_data_titles_with_year.csv", sep=';')
df


Out[14]:
ARTICLE_TITLE PUBYEAR_y wos scopus
0 Big Data with Cloud Computing: an insight on t... 2014.0 True NaN
1 Understanding Democracy and Development Traps ... 2015.0 True NaN
2 Psycho-Informatics: Big Data shaping modern ps... 2014.0 True NaN
3 Keywords co-occurrence mapping knowledge domai... 2015.0 True NaN
4 Introducing TPCx-HS: The First Industry Standa... 2015.0 True NaN
5 Application and Exploration of Big Data Mining... 2016.0 True NaN
6 Performance Evaluation of a Natural Language P... 2014.0 True NaN
7 Context-aware Task Allocation for Fast Paralle... 2014.0 True NaN
8 Improving China's Corporate Governance Within ... 2015.0 True NaN
9 Big Data and Predictive Analytics in ERP Syste... 2014.0 True NaN
10 Re-Stream: Real-time and energy-efficient reso... 2015.0 True True
11 BIG DATA IN SURVEY RESEARCH 2015.0 True NaN
12 Models and Data Sources Used in Systems Medici... 2016.0 True NaN
13 Big data and precision 2015.0 True True
14 IoT-Security approach analysis for the novel n... 2014.0 True NaN
15 A meeting report from the 2013 GARNet workshop... 2015.0 True NaN
16 Learning methodologies for wireless big data n... 2016.0 True True
17 Reducing Data Dimensions for Systems Engineeri... 2014.0 True NaN
18 Twitter Streams Fuel Big Data Approaches to He... 2015.0 True NaN
19 THE LATENT STATE HAZARD MODEL, WITH APPLICATIO... 2015.0 True NaN
20 Deploying and Managing a Network of Autonomous... 2015.0 True NaN
21 The Person-Event Data Environment: leveraging ... 2013.0 True NaN
22 A secure and scalable storage system for aggre... 2015.0 True True
23 MaRDiGraS: Simplified Building of Reachability... 2013.0 True NaN
24 Power System Disaster-Mitigating Dispatch Plat... 2014.0 True NaN
25 A k-anonymity Method based on SEM (Search Engi... 2013.0 True NaN
26 Philosophical Reflections on Data 2014.0 True NaN
27 A Risk and Benefits Behavioral Model to Assess... 2013.0 True NaN
28 Complications of Laryngeal Masks in Children B... 2013.0 True NaN
29 From social data mining to forecasting socio-e... 2011.0 True NaN
... ... ... ... ...
9449 Big data analysis and data velocity 2015.0 NaN True
9450 I/O characteristics and implications of big da... 2015.0 NaN True
9451 Secure distribution of big data based on bitto... 2013.0 NaN True
9452 Modern aspects in development of branch applic... 2015.0 NaN True
9453 Multi-strategy based sina microblog data acqui... 2014.0 NaN True
9454 A novel Cp-Tree-based co-located classifier fo... 2015.0 NaN True
9455 Application of big data technology in support ... 2015.0 NaN True
9456 Real-time effective framework for unstructured... 2013.0 NaN True
9457 Big Data-Security and Privacy 2015.0 NaN True
9458 Research on public opinion based on Big Data 2015.0 NaN True
9459 Locally refined splines representation for geo... 2015.0 NaN True
9460 Big data study for coping with stress 2015.0 NaN True
9461 Digital Data Grows into Big Data 2015.0 NaN True
9462 SAW classification algorithm for Chinese text ... 2015.0 NaN True
9463 Interactive e-science cyberinfrastructure for ... 2015.0 NaN True
9464 Public policy considerations for data-driven i... 2013.0 NaN True
9465 The performance of MapReduce over the varying ... 2013.0 NaN True
9466 Understanding library user engagement strategi... 2015.0 NaN True
9467 Twitter Mining for Discovery, Prediction and C... 2015.0 NaN True
9468 Process optimization and monitoring along big ... 2015.0 NaN True
9469 RSenter: Terms mining tool from unstructured d... 2013.0 NaN True
9470 Resource management in cloud federation using ... 2014.0 NaN True
9471 Design and implementation of a dynamic educati... 2014.0 NaN True
9472 Big data for cyber physical systems an analysi... 2014.0 NaN True
9473 Designing a big data processing platform for a... 2013.0 NaN True
9474 Potential and Pitfalls for Big Data in Health ... 2015.0 NaN True
9475 GridKa school - Teaching information technolog... 2015.0 NaN True
9476 A survey on PCM-based big data storage and man... 2015.0 NaN True
9477 A distributed file system over heterogeneous s... 2015.0 NaN True
9478 Adaptive collaborative filtering based on scal... 2016.0 NaN True

9479 rows × 4 columns
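
As an aside, the .loc assignment above could equivalently be written with fillna, which takes the WoS year where present and falls back to the Scopus year. A sketch (it would run in place of that line, before the column selection drops PUBYEAR_x):


In [ ]:
#equivalent to the .loc assignment above (sketch, before PUBYEAR_x is dropped):
df['PUBYEAR_y'] = df['PUBYEAR_x'].fillna(df['PUBYEAR_y'])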

grouping data


In [17]:
grouped=df.groupby(['PUBYEAR_y'])
df2=grouped.agg('count').reset_index() #count() counts non-NaN cells per column
df2


Out[17]:
PUBYEAR_y ARTICLE_TITLE wos scopus
0 1995.0 1 1 0
1 2003.0 1 0 1
2 2004.0 1 1 0
3 2005.0 1 0 1
4 2006.0 2 0 2
5 2007.0 1 1 0
6 2008.0 4 3 1
7 2009.0 4 4 1
8 2010.0 7 4 3
9 2011.0 31 10 22
10 2012.0 323 106 228
11 2013.0 1421 570 904
12 2014.0 2652 1048 1789
13 2015.0 4111 1452 3127
14 2016.0 919 322 750
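
Note that the per-database columns can add up to more than the ARTICLE_TITLE total: count() skips NaN cells, so a paper indexed in both databases is counted once in the total but once in each source column. A toy example with made-up data:


In [ ]:
#toy example (made-up data) illustrating the NaN-skipping count:
toy = pd.DataFrame({'PUBYEAR_y': [2015, 2015],
                    'ARTICLE_TITLE': ['paper A', 'paper B'],
                    'wos': [True, None], 'scopus': [True, True]})
toy.groupby('PUBYEAR_y').count() #-> ARTICLE_TITLE: 2, wos: 1, scopus: 2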

visualize with plotly:

We make three diagrams: 1) a horizontal bar plot comparing the overall number of papers per database, 2) a vertical bar plot differentiating by time and database, and 3) the same vertical bar plot with a logarithmic y-scale, which allows for better inspection of the smaller numbers.


In [46]:
#set data for horizontal bar plot:
data = [go.Bar(
            x=[df2['wos'].sum(), df2['scopus'].sum(), df2['ARTICLE_TITLE'].sum()],
            y=['Web of Science', 'Scopus', 'Total'],
            orientation = 'h',
            marker=dict(
                color=colorlist
            )
)]
#py.plot(data, filename='big_data_papers_horizontal') #for uploading to plotly
py.iplot(data, filename='horizontal-bar')


Out[46]:
'https://plot.ly/~mathias.riechert/131'
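
If you have no plotly account, plotly's offline mode renders the same figure locally in the notebook (plotly 2.x API, matching the imports above):


In [ ]:
#alternative without a plotly account: render locally in the notebook
import plotly.offline as pyo
pyo.init_notebook_mode(connected=True) #loads plotly.js into the notebook
pyo.iplot(data)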

In [47]:
#set data for stacked bar plot:
trace1 = go.Bar(
    x=df2['PUBYEAR_y'],
    y=df2['wos'],
    name='Web of Science',
    marker=dict(
        color=colorlist[0]
    )
)
trace2 = go.Bar(
    x=df2['PUBYEAR_y'],
    y=df2['scopus'],
    name='Scopus',
    marker=dict(
        color=colorlist[1]
    )
)
trace3 = go.Bar(
    x=df2['PUBYEAR_y'],
    y=df2['ARTICLE_TITLE'],
    name='All Papers',
    marker=dict(
        color=colorlist[2]
    )
)
data = [trace1, trace2,trace3]

In [54]:
#set layout for grouped bar chart with normal y scale:
layout_no_log = go.Layout(
    title='Big data papers over time',
    barmode='group',
    xaxis=dict(
        title='year',
        titlefont=dict(
            family='Arial, sans-serif',
            size=14,
            color='lightgrey'
        ),
        tickfont=dict(
            family='Arial, sans-serif',
            size=10,
            color='black'
        ),
        showticklabels=True,
        dtick=1,
        tickangle=45,
    )
)
#plot:
fig1 = go.Figure(data=data, layout=layout_no_log)
py.iplot(fig1, filename='big_data_papers_no_log')

In [44]:
#set layout for grouped bar chart with logarithmic y scale:
layout_log = go.Layout(
    title='Big data papers over time (log y-scale)',
    barmode='group',
    xaxis=dict(
        title='year',
        titlefont=dict(
            family='Arial, sans-serif',
            size=14,
            color='lightgrey'
        ),
        tickfont=dict(
            family='Arial, sans-serif',
            size=10,
            color='black'
        ),
        showticklabels=True,
        dtick=1,
        tickangle=45,
    ),
    yaxis=dict(
        type='log'
    )
)
fig2 = go.Figure(data=data, layout=layout_log)
py.iplot(fig2, filename='big_data_papers_log')


Out[44]:
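
At the time of writing, the plotly cloud API could also export a static image of a figure; a sketch, assuming an authenticated plotly 2.x session:


In [ ]:
#optional: save a static PNG via the plotly cloud (requires API credentials):
py.image.save_as(fig2, filename='big_data_papers_log.png')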